/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Generate measures of exposure to routine jobs and wages by percentile of the wage distribution.
Census data are stored in an external folder because of their size ($census_data) and are not included in this replication kit.
Contact Pascual Restrepo (pascual@bu.edu) for a link to the data.
5.7.2019 (revised 5.8.2020)
Pascual Restrepo
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

* Root folders used by every step below. Both paths are written with forward slashes: *
* Stata accepts them on every platform, whereas a backslash is Windows-only and, when it *
* ends up directly in front of a macro reference, suppresses the expansion of that macro. *
global census_data "C:/Users/Pascual Restrepo/Dropbox/Research/robots_and_jobs/raw_data/ipums_census_data"
global project     "C:/Users/Pascual Restrepo/Dropbox/Research/wealth_inequality/AutomationInequality/replication_mrr_may_2020/s2a_calibration_model"

* sample selection criteria *
* The expression below is TRUE for observations that are dropped in Steps 2 and 4: *
*   - gqtyped between 100 and 499 (NOTE(review): presumably institutional group quarters; confirm against the IPUMS codebook) *
*   - statefip equal to 2 or 15 (NOTE(review): presumably Alaska and Hawaii; confirm) *
*   - age below 25 or above 55 *
*   - zero person weight *
global sample_selection (gqtyped>=100 & gqtyped<=499) ///
                        | (statefip==2 | statefip==15) ///
                        | (age<25 | age>55) ///
                        | perwt==0

***************************************************************************************
*** Step 1: convert data on occupational characteristics to the occ1990dd_acs system ***
***************************************************************************************
* temporary file that receives the occupation-level routine indicators, *
* and a temporary scalar name that holds the ONET percentile cutoff *
tempfile occdata
tempname cutoff

* start from the list of occ1990dd occupations *
use "${project}/raw_data/occ1990dd/occ1990dd_list.dta", clear

* DOT-based occupational characteristics from David Autor and David Dorn *
* (available from David Dorn's website); agricultural jobs are not covered, *
* so unmatched rows from the list are allowed but rows found only in the task file are not *
merge 1:1 occ1990dd using "${project}/raw_data/occ1990dd/occ1990dd_task_dot.dta", assert(1 3) nogenerate

* measure 1: indicator for the top 33% of occupations by routine content *
rename R33_3a routine_dot

* ONET-based occupational characteristics from David Autor and Daron Acemoglu *
* (available from David Autor's website); some supervising jobs are not covered *
merge 1:1 occ1990dd using "${project}/raw_data/occ1990dd/occ1990dd_task_onet.dta", assert(1 3) nogenerate

* measure 2: indicator for the top 33% of jobs by routine content according to ONET. *
* Routine intensity is the sum of the two routine scores minus the average of all six task scores. *
* note: we also explored the alternative specification (r_cog+r_man)/2-((nr_cog_anal+ nr_cog_pers+ nr_man_phys+ nr_man_pers))/4
generate routine_intensity = r_cog+r_man-(1/6)*(nr_cog_anal+ nr_cog_pers+ r_cog+ r_man+ nr_man_phys+ nr_man_pers)

* weighted 66th percentile of routine intensity; occupations strictly above it are flagged, *
* and the flag is left missing when either the intensity or the weight is missing *
_pctile routine_intensity [w=lswt], p(66)
scalar `cutoff' = r(r1)
generate routine_onet = (routine_intensity > `cutoff') if routine_intensity != . & lswt != .

* keep the identifiers and the two routine indicators, then store them for Step 4 *
keep occ1990dd occ1990dd_desc routine_onet routine_dot
save `occdata', replace


***************************************************************
*** Step 2: Compile census wage data for 1980 2000 and 2014 ***
***************************************************************
* For each year this loop writes output/WAGEDIST_<year>.dta with one row per wage percentile (qinc) *
* holding mean hourly, weekly and yearly wages and the mean log hourly wage. *
foreach year in 1980 2000 2014 {

    * pick the Census rows for this year; the 2014 file is built from rows with year==2016 and nonmissing multyear *
    if `year' == 2014 {
        local rows "year==2016 & multyear!=."
    }
    else {
        local rows "year==`year'"
    }
    use if `rows' using "${census_data}/ipums_census_full.dta", clear

    * sample selection: drop every observation flagged by the sample_selection global *
    drop if ${sample_selection}

    * obtain hourly, weekly and yearly wages from the wage-cleaning subfile *
    * (the original author labels these as real wages) *
    run "${project}/do_files/subfile to clean wages.do"
    foreach freq in hr wk yr {
        generate ipums_`freq'wage = `freq'wage if `freq'wage != .
    }
    keep if ipums_hrwage != .

    * keep the needed variables and assign each worker to a percentile of the hourly wage, weighting by perwt *
    keep sex ipums_hrwage ipums_wkwage ipums_yrwage perwt lswts
    sort ipums_hrwage
    xtile qinc = ipums_hrwage [w=perwt], nquantiles(100)
    generate ipums_ln_hrwage = ln(ipums_hrwage)

    * mean wages by percentile, weighting by lswts *
    collapse (mean) ipums_*wage [iw=lswts], fast by(qinc)

    * interpolate so that wages are non missing: *
    * pad the data to 200 rows, park any percentile absent from the collapse in row 100 + percentile, *
    * and then discard the padding rows that were not used *
    set obs 200
    forvalues pct = 1/100 {
        quietly count if qinc == `pct'
        if r(N) == 0 {
            replace qinc = `pct' in `=100 + `pct''
        }
    }
    drop if qinc == .
    unique qinc
    sort qinc

    * The three level series and the log hourly series go through the same two passes. *
    foreach wagevar in ipums_hrwage ipums_wkwage ipums_yrwage ipums_ln_hrwage {

        * pass 1: fill a missing percentile with the mean over the percentiles within one step of it *
        rangestat (mean) `wagevar', interval(qinc -1 1)
        replace `wagevar' = `wagevar'_mean if `wagevar' == .
        drop `wagevar'_mean
        assert `wagevar' != .

        * pass 2: extrapolate top wages (differences in top censoring in Census). *
        * The log of the series is regressed on ln(100.5 - percentile) over percentiles 90 to 99 *
        * and the fitted value replaces the observation above percentile 99. *
        * The log hourly series is already in logs, so it enters and leaves without transformation. *
        local in_logs = ("`wagevar'" == "ipums_ln_hrwage")
        if `in_logs' {
            generate ln_wage = `wagevar'
        }
        else {
            generate ln_wage = ln(`wagevar')
        }
        generate lnrank = ln(100.5 - qinc)
        regress ln_wage lnrank if qinc >= 90 & qinc <= 99
        predict wpred, xb
        replace ln_wage = wpred if qinc > 99
        if `in_logs' {
            replace `wagevar' = wpred if qinc > 99
        }
        else {
            replace `wagevar' = exp(wpred) if qinc > 99
        }
        drop ln_wage lnrank wpred
    }

    save "${project}/output/WAGEDIST_`year'.dta", replace
}

**************************************************************************
*** Step 3: Compute observed change in wages (for figure in the paper) ***
**************************************************************************
* Put the 1980 and 2014 percentile files side by side, tagging the hourly wage *
* and log hourly wage of each year with a year suffix. *
use "${project}/output/WAGEDIST_1980.dta", clear
foreach stub in ipums_hrwage ipums_ln_hrwage {
    rename `stub' `stub'_1980
}

merge 1:1 qinc using "${project}/output/WAGEDIST_2014.dta", nogenerate
foreach stub in ipums_hrwage ipums_ln_hrwage {
    rename `stub' `stub'_2014
}

* smooth data using a moving average over the percentiles within five steps of each one; *
* the smoothed series is stored with a _mean suffix and also fills any gap in the raw series *
foreach stub in ipums_hrwage ipums_ln_hrwage {
    foreach yr in 1980 2014 {
        local series `stub'_`yr'
        rangestat (mean) `series', interval(qinc -5 5)
        replace `series' = `series'_mean if `series' == .
    }
}

* observed change: smoothed log hourly wage in 2014 minus smoothed log hourly wage in 1980, by percentile *
generate change_wage_obs = ipums_ln_hrwage_2014_mean - ipums_ln_hrwage_1980_mean
keep qinc change_wage_obs
export delimited using "${project}/output/observed_wages.csv", replace

***************************************************************
*** Step 4: Compute share of wages obtained in routine jobs ***
***************************************************************
* For each year this loop writes output/OMEGA_<year>.dta with one row per wage percentile (qinc): *
* the smoothed wage-weighted routine shares (DOT and ONET) and the same shares divided by their aggregate. *
foreach year in 1980 2000 {

    * use Census data *
    use if year == `year' using "${census_data}/ipums_census_full.dta", clear
    drop if perwt == 0

    * sample selection: drop every observation flagged by the sample_selection global *
    drop if ${sample_selection}

    * compute hourly, weekly and yearly wages with the wage-cleaning subfile *
    * (the original author labels these as real wages) *
    run "${project}/do_files/subfile to clean wages.do"
    foreach freq in hr wk yr {
        generate ipums_`freq'wage = `freq'wage if `freq'wage != .
    }
    keep if ipums_hrwage != .

    * clean *
    keep sex ipums_hrwage ipums_wkwage ipums_yrwage perwt lswts  occ

    * match occupational characteristics using consistent occupational groups: *
    * first attach occ1990dd codes through the year-specific crosswalk from David Dorn *
    * (every Census occupation must be found in it), then attach the routine indicators from Step 1 *
    merge m:1 occ using "${project}/raw_data/occ1990dd/occ`year'_occ1990dd.dta", assert(2 3) keep(3) nogenerate
    merge m:1 occ1990dd using `occdata', keep(1 3) nogenerate

    * generate percentiles of the hourly wage distribution, weighting by perwt *
    sort ipums_hrwage
    xtile qinc = ipums_hrwage [w=perwt], nquantiles(100)

    * aggregate across percentiles using labor income as weights *
    generate full_wt = lswts*ipums_hrwage

    * labor income of workers for whom each routine measure is defined *
    foreach src in dot onet {
        generate wages_`src' = full_wt if routine_`src' != .
    }

    foreach src in dot onet {
        summarize routine_`src' [w=full_wt]
    }

    * collapse data to percentiles: unweighted totals of labor income and income-weighted routine shares *
    collapse (rawsum) wages_dot wages_onet (mean) routine_* [w=full_wt], by(qinc)

    * interpolate so that routine shares are non missing: *
    * pad the data to 200 rows, park any percentile absent from the collapse in row 100 + percentile, *
    * and then discard the padding rows that were not used *
    set obs 200
    forvalues pct = 1/100 {
        quietly count if qinc == `pct'
        if r(N) == 0 {
            replace qinc = `pct' in `=100 + `pct''
        }
    }
    drop if qinc == .
    unique qinc
    sort qinc

    * smooth data using a moving average over the percentiles within five steps of each one *
    rangestat (mean) routine_*, interval(qinc -5 5)
    foreach src in dot onet {
        replace routine_`src' = routine_`src'_mean
    }
    drop *_mean
    assert routine_dot != . & routine_onet != .

    * aggregate shares: mean of the smoothed routine share weighted by labor income *
    foreach src in dot onet {
        summarize routine_`src' [w=wages_`src']
        scalar aggregate_`src' = r(mean)
    }

    * generate omegas: routine share of each percentile relative to the aggregate share *
    foreach src in dot onet {
        generate omega_`src' = routine_`src'/aggregate_`src'
    }

    * tests and checks *
    foreach src in dot onet {
        summarize omega_`src' [w=wages_`src']
    }

    * save routine shares *
    save "${project}/output/OMEGA_`year'.dta", replace
}
